# This script prepares the different imputed and oversampled training datasets to develop the preschool CAPP models. 
# These preschool training datasets prepared in this script will have had the following optimisation techniques applied: MICE imputation > ADASYN oversampling
# Once the data is prepared, this script needs to be immediately followed by: "Model_development_XXX.txt", where XXX is the name of the different algorithms considered. 
# The data in file "MICE_imputed_standardised_preschool_training_dataset_1185ID.csv" is found in IOWBC_imputed_data.xlsx, sheet: "Standardised MICE preschool training"
# The data in files named "MICE_imputed_oversampled_preschool_dataset_XXX.csv" were developed using the script "Data_preparation_CAPP_imputation_oversampling.txt" (data can be found in XXX).
# Python version 3.6.8 was used 

# Imports (these must run before any of the modules below are used)
import os

import pandas as pd
import numpy as np

# Set working directory.
# BUGFIX: os.chdir was previously called before `import os`, which raised a
# NameError at startup; the imports now come first.
# NOTE(review): "/../../" resolves to the filesystem root "/" — confirm the
# intended directory, since the relative read_csv paths below depend on it.
os.chdir("/../../")


# Import datasets
# Baseline (no oversampling): MICE-imputed, standardised preschool training set.
data_0 = pd.read_csv(
    "/scratch/dk2e18/Asthma_Prediction_Model/Imputation_Oversampling/MICE/MICE_imputed_standardised_preschool_training_dataset_1185ID.csv",
    index_col=False,
)
# Drop the stray index column written out by the CSV export.
data_0 = data_0.drop(columns=["Unnamed: 0"])


def _load_oversampled(pct):
    """Read the ADASYN-oversampled preschool training CSV for one oversampling percentage."""
    return pd.read_csv(
        "MICE_imputed_oversampled_preschool_dataset_" + str(pct) + "%.csv",
        index_col=False,
    )


# ADASYN-oversampled variants of the training set, one per oversampling level.
data_25 = _load_oversampled(25)
data_50 = _load_oversampled(50)
data_100 = _load_oversampled(100)
data_150 = _load_oversampled(150)
data_200 = _load_oversampled(200)
data_250 = _load_oversampled(250)
data_300 = _load_oversampled(300)


# Remove the extra synthetic cases ADASYN produced beyond the target counts and
# collect every training dataset considered for model development into `data`.
# The baseline set has 1185 rows; each oversampled variant keeps only the first
# N rows of its file (row caps below).
data = [
    data_0,
    data_25.iloc[0:1229, :],
    data_50.iloc[0:1273, :],
    data_100.iloc[0:1361, :],
    data_150.iloc[0:1449, :],
    data_200.iloc[0:1537, :],
    data_250.iloc[0:1625, :],
    data_300.iloc[0:1713, :],
]

# Indices of the datasets in `data`; used during model development to loop
# through each training dataset. Derived from len(data) so it stays correct if
# a dataset is added or removed.
# NOTE(review): the name `set` shadows the Python builtin; it is kept only for
# compatibility with the downstream model-development scripts that reference it.
set = list(range(len(data)))

# Import preschool test data, standardised against the imputed preschool
# training dataset - data found in IOWBC_imputed_data.xlsx,
# sheet: "Standardised preschool test set"
test = pd.read_csv("Preschool_MICE_standardised_test_dataset_183IDs.csv", index_col=False)
# Drop the stray index column written out by the CSV export.
test = test.drop(columns=["Unnamed: 0"])

# Split the test data into the feature matrix and the outcome vector.
y_test = test["Asthma_10YR"]
X_test = test.drop(["Study_ID", "Asthma_10YR"], axis=1)